Here is the report of the network analysis of the crop health data.
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## Loading required package: reshape
##
## Attaching package: 'reshape'
##
## The following objects are masked from 'package:plyr':
##
## rename, round_any
##
## The following object is masked from 'package:dplyr':
##
## rename
##
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
##
## The following objects are masked from 'package:reshape':
##
## colsplit, melt, recast
##
##
## Attaching package: 'lubridate'
##
## The following object is masked from 'package:reshape':
##
## stamp
##
## The following object is masked from 'package:plyr':
##
## here
##
## Loading required package: survival
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist, unsplit
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.3-1
##
## Attaching package: 'igraph'
##
## The following object is masked from 'package:vegan':
##
## diversity
##
## The following object is masked from 'package:permute':
##
## permute
##
## The following objects are masked from 'package:BiocGenerics':
##
## normalize, union
##
## The following objects are masked from 'package:lubridate':
##
## %--%, union
##
## The following objects are masked from 'package:dplyr':
##
## %>%, as_data_frame, groups, union
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
Load the survey data
# RCurl provides getURL() used below to fetch the survey CSV over HTTPS.
library(RCurl)
## Loading required package: bitops
# Download the survey data (a published Google Sheet) as CSV text and parse it.
file <- getURL("https://docs.google.com/spreadsheets/d/1zB7gNdI7Nk7SuHuPWcjzaKnjuwkvL6sOVMo0zMfuV-c/pub?gid=558862364&single=true&output=csv")
data <- read.csv(text = file)
# Alternative local workflows, kept for reference:
# save(data, file = "manuscript1/data/skep1data.RData")
#load(file = "manuscript1/data/skep1data.RData")
#Filepath <- "~/Google Drive/1.SKEP1/SKEP1survey.xls"
#data <- readWorksheetFromFile(Filepath, sheet = 1)
data[data == "-"] <- NA # replace '-' (missing-data marker in the sheet) with NA
data[data == ""] <- NA # replace empty strings (missing data) with NA
#==== to lower variable names ====
names(data) <- tolower(names(data)) # lower-case all column names for consistency
Select out the columns which are not included in the analysis
# Drop columns that are not used in the analysis.
data$phase <- NULL # there is only one type of phase in the survey
data$identifier <- NULL # this variable is not included in the analysis
data$village <- NULL
data$year <- NULL
data$season <- NULL
data$lat <- NULL
data$long <- NULL
data$fa <- NULL # field area is not included in the analysis
data$fn <- NULL # farmer name cannot be included in this survey analysis
data$fp <- NULL # NOTE(review): meaning of fp unknown -- confirm against the survey codebook
data$lfm <- NULL # there is only one type of land form in this survey
data$ced <- NULL # date data cannot be included in the network analysis
data$cedjul <- NULL
data$hd <- NULL # date data cannot be included in the network analysis
data$hdjul <- NULL
data$cvr <- NULL
data$varcoded <- NULL # vartype is recoded manually below instead
data$fymcoded <- NULL # fym is recoded manually below instead
data$mu <- NULL # no record
data$nplsqm <- NULL
data$rbpx <- NULL # no record
#==== correct the variable types =====
# Coerce each column to its intended type: categorical survey descriptors
# become factors, measurements and counts become numeric.
# BUG FIX: the original assigned nlhmax = as.numeric(nltmax), a copy-paste
# error that silently overwrote nlhmax with a duplicate of nltmax.
data <- transform(data,
  country = as.factor(country),
  pc = as.factor(pc),
  cem = as.factor(cem),
  ast = as.factor(ast),
  ccd = as.numeric(ccd),
  vartype = as.factor(vartype),
  fym = as.character(fym),
  n = as.numeric(n),
  p = as.numeric(p),
  k = as.numeric(k),
  mf = as.numeric(mf),
  wcp = as.factor(wcp),
  iu = as.numeric(iu),
  hu = as.numeric(hu),
  fu = as.numeric(fu),
  cs = as.factor(cs),
  ldg = as.numeric(ldg),
  yield = as.numeric(yield),
  dscum = as.factor(dscum),
  wecum = as.factor(wecum),
  ntmax = as.numeric(ntmax),
  npmax = as.numeric(npmax),
  nltmax = as.numeric(nltmax),
  nlhmax = as.numeric(nlhmax), # was as.numeric(nltmax): copy-paste bug
  waa = as.numeric(waa),
  wba = as.numeric(wba),
  dhx = as.numeric(dhx),
  whx = as.numeric(whx),
  ssx = as.numeric(ssx),
  wma = as.numeric(wma),
  lfa = as.numeric(lfa),
  lma = as.numeric(lma),
  rha = as.numeric(rha),
  thrx = as.numeric(thrx),
  pmx = as.numeric(pmx),
  defa = as.numeric(defa),
  bphx = as.numeric(bphx),
  wbpx = as.numeric(wbpx),
  awx = as.numeric(awx),
  rbx = as.numeric(rbx),
  rbbx = as.numeric(rbbx),
  glhx = as.numeric(glhx),
  stbx = as.numeric(stbx),
  hbx = as.numeric(hbx),
  bbx = as.numeric(bbx),
  blba = as.numeric(blba),
  lba = as.numeric(lba),
  bsa = as.numeric(bsa),
  blsa = as.numeric(blsa),
  nbsa = as.numeric(nbsa),
  rsa = as.numeric(rsa),
  lsa = as.numeric(lsa),
  shbx = as.numeric(shbx),
  shrx = as.numeric(shrx),
  srx = as.numeric(srx),
  fsmx = as.numeric(fsmx),
  nbx = as.numeric(nbx),
  dpx = as.numeric(dpx),
  rtdx = as.numeric(rtdx),
  rsdx = as.numeric(rsdx),
  gsdx = as.numeric(gsdx),
  rtx = as.numeric(rtx)
)
# Recode pc as a binary indicator: 1 if "rice", 0 otherwise.
data$pc <- ifelse(data$pc == "rice", 1, 0)
# Crop establishment method: transplanted ("trp"/"TPR") -> 1,
# direct-seeded ("DSR"/"dsr") -> 2 (both case variants appear in the raw data)
levels(data$cem)[levels(data$cem) == "trp"] <- 1
levels(data$cem)[levels(data$cem) == "TPR"] <- 1
levels(data$cem)[levels(data$cem) == "DSR"] <- 2
levels(data$cem)[levels(data$cem) == "dsr"] <- 2
# fym (two types, 0 and 1): raw data are recorded as no, yes, or a value.
# "no" or "0" -> 0; anything else (yes, or a value above 0) -> 1.
data$fym <- ifelse(data$fym == "no", 0,
ifelse(data$fym == "0", 0, 1
)
)
# vartype has three types: traditional ("tv") -> 1, modern ("mv") -> 2,
# hybrid ("hyb") -> 3; anything else becomes NA.
data$vartype <- ifelse(data$vartype == "tv", 1,
ifelse(data$vartype == "mv", 2,
ifelse(data$vartype == "hyb", 3, NA
)
)
)
# wcp, weed control management: hand -> 1, herbicide -> 2, both -> 3
levels(data$wcp)[levels(data$wcp) == "hand"] <- 1
levels(data$wcp)[levels(data$wcp) == "herb"] <- 2
levels(data$wcp)[levels(data$wcp) == "herb-hand"] <- 3
# Crop status: ordinal scale 1 (very poor) .. 5 (very good)
levels(data$cs)[levels(data$cs) == "very poor"] <- 1
levels(data$cs)[levels(data$cs) == "poor"] <- 2
levels(data$cs)[levels(data$cs) == "average"] <- 3
levels(data$cs)[levels(data$cs) == "good"] <- 4
levels(data$cs)[levels(data$cs) == "very good"] <- 5
#==== clean the data ====
# Convert every column except the first two to numeric.
num.data <- as.data.frame(apply(data[, -c(1, 2)], 2, as.numeric))
data <- cbind(data[1:2], num.data)
# Exclude columns with zero variance.  The variance test only covers
# columns 3..n, so prepend TRUE for the first two columns.
# BUG FIX: the original indexed ALL columns with the shorter logical
# vector (`data[, apply(data[, -c(1,2)], ...) != 0]`); R silently recycles
# a short logical subscript, which mis-selects columns as soon as any
# zero-variance column exists.  An all-NA column (variance NA) is dropped
# rather than producing an NA subscript error.
keep <- c(TRUE, TRUE, apply(data[, -c(1, 2)], 2, var, na.rm = TRUE) != 0)
keep[is.na(keep)] <- FALSE
data <- data[, keep]
data <- data[complete.cases(data), ] # exclude rows which contain NA
#==== cluster analysis of the production situation of the survey data ====
# The production-situation (PS) variables run from column "pc" through "fu";
# locate them by name so the selection survives column reordering.
start.PS <- "pc"
end.PS <- "fu"
start.col.PS <- match(start.PS, names(data))
end.col.PS <- match(end.PS, names(data))
# Column 1 is carried along as a label but excluded from the distance
# matrix below (PS.data[-1]).
PS.data <- data[, c(1, start.col.PS:end.col.PS)]
# transform all variables to numeric type
# Elbow-method scree plot for choosing k, kept for reference:
# wss <- (nrow(PS.data)-1)* sum(apply(PS.data, 2, var))
# for (i in 2:15) wss[i] <- sum(kmeans(PS.data,
# centers=i)$withinss)
# plot(1:15, wss, type="b", xlab="Number of Clusters",
# ylab="Within groups sum of squares")
# Dissimilarity matrix via daisy() (cluster package); the knitr output
# shows it warns that binary variables are treated as interval scaled.
dist.PS <- daisy(PS.data[-1])
## Warning in daisy(PS.data[-1]): binary variable(s) 1, 2, 6 treated as
## interval scaled
# Average-linkage (UPGMA) hierarchical clustering on the dissimilarities.
cluster.PS <- hclust(dist.PS, method = "average")
dendro.PS <- as.dendrogram(cluster.PS)
plot(dendro.PS, center = T, nodePar = list(lab.cex = 0.6,
lab.col = "black", pch = NA),
main = "Dendrogram for Production situation")
# draw rectangles around the k = 2 clusters
rect.hclust(tree = cluster.PS, k=2, border = c("red", "blue"))
# cluster membership (1 or 2) for each observation
PS.no <- cutree(cluster.PS, k = 2)
# Cophenetic correlation: how faithfully the dendrogram preserves the
# original pairwise distances (reported as ~0.735 in the output below).
rcluster.PS <- cophenetic(cluster.PS)
cor(dist.PS, rcluster.PS)
## [1] 0.7346866
# Attach the cluster membership to the survey data as a factor column.
data <- cbind(data, PS.no)
data$PS.no <- as.factor(data$PS.no)
# One data frame per country, ordered by the factor levels of `country`.
name.country <- levels(data$country)
by.country.data <- lapply(name.country, function(ctry) {
  data %>% filter(country == ctry)
})
# Bar chart of the number of observations per country, with the count
# printed inside each bar.
# BUG FIX: the original used geom_histogram(weights = count) on a discrete
# axis -- `weights` is not a geom_histogram parameter and `count` is
# undefined; geom_bar()/stat_count() are the correct discrete equivalents.
g <- ggplot(data, aes(x = country)) +
  geom_bar() +
  stat_count(geom = "text", aes(label = ..count..), vjust = 1.5, color = "white") +
  scale_y_continuous(limits = c(0, 125), breaks = NULL) +
  theme(legend.position = "none") +
  ggtitle("The no of observation of dataset in each country")
print(g)
## ymax not defined: adjusting position using y instead
# NOTE(review): par() configures base graphics only and does not affect the
# ggplot output below; it is kept because later base-graphics plots in this
# script (the igraph network plots) inherit these settings.
par(mfrow = c(1, 2), las = 1)
for (i in seq_along(name.country)) {
  # BUG FIX: geom_bar() replaces geom_histogram(weights = count) --
  # `weights` is not a valid parameter and PS.no is a discrete factor.
  g <- ggplot(by.country.data[[i]], aes(x = PS.no)) +
    geom_bar() +
    ggtitle(paste("Histogram PS cluster of", name.country[i])) +
    theme(legend.position = "none")
  print(g)
}
# The profile of the PS clusters: drop the first column and columns 17:56.
# NOTE(review): the 17:56 range is positional -- confirm it still matches
# the intended columns if the upstream column set changes.
clus.PS.data <- data[, -c(1, 17:56)]
# melt() (reshape2) stacks the remaining measure columns into long format;
# the knitr output confirms country and PS.no are used as id variables.
m.PS.data <- melt(clus.PS.data)
## Using country, PS.no as id variables
name.variable <- levels(m.PS.data$variable)
name.PS.no <- levels(m.PS.data$PS.no)
# Check the profile of each cluster by histogram: one plot per
# variable x cluster combination, y-axis scaled to percent of observations.
for (i in seq_along(name.variable)) {
  subtemp <- subset(m.PS.data, variable == name.variable[i])
  for (j in seq_along(name.PS.no)) {
    subtemp1 <- subtemp %>% filter(PS.no == name.PS.no[j])
    # BUG FIX: the original placed ggtitle() on its own line without `+`,
    # so the title object was built and discarded; it is now chained on.
    g <- ggplot(subtemp1, aes(x = value)) +
      geom_bar(aes(y = (..count..)/sum(..count..)), binwidth = 1) +
      scale_y_continuous(limits = c(0, 1), labels = percent) +
      ylab("Percent") + xlab(paste(name.variable[i])) +
      ggtitle(paste("Histogram of", name.variable[i], "in PS. no", name.PS.no[j]))
    print(g)
  }
}
Network analysis
#head(data)
# Select only the variables related to the injury profiles (columns
# "dhx" through "rtx"), located by name.
start.IP <- "dhx"
end.IP <- "rtx"
start.col.IP <- match(start.IP, names(data))
end.col.IP <- match(end.IP, names(data))
IP.data <- data[start.col.IP:end.col.IP]
IP.data <- IP.data[ ,apply(IP.data, 2, var, na.rm = TRUE) != 0] # exclude the column with variation = 0
# Group label = country x production-situation cluster, e.g. "IDN_1".
groups <- paste(data$country, data$PS.no, sep = "_")
IP.data <- cbind(groups, IP.data)
IP.data[is.na(IP.data)] <- 0 # defensive: rows were already filtered by complete.cases() above
trts <- as.vector(unique(IP.data$groups))
#=====co_occurrence_pairwise.R====
# Pairwise Spearman co-occurrence test between every pair of injury
# variables, run separately within each country x cluster group.
results <- matrix(nrow = 0, ncol = 7)
# BUG FIX: the original set options(warnings = -1), which is not a real
# option; options(warn = -1) is what actually suppresses the tie warnings
# emitted by cor.test().
options(warn = -1)
for (a in 1:length(trts)) {
  # pull the current treatment group and subset the dataset for it
  trt.temp <- trts[a]
  temp <- subset(IP.data, groups == trt.temp)
  # Column 1 holds the group label, so the species columns run from 2 to
  # ncol(temp): compare every species column against every later one.
  # (The original comment claimed the data started at column 6.)
  for (b in 2:(dim(temp)[2] - 1)) {
    for (c in (b + 1):(dim(temp)[2])) {
      # total abundance of each of the two species being compared
      species1.ab <- sum(temp[, b])
      species2.ab <- sum(temp[, c])
      # A species observed at most once carries no co-occurrence signal.
      if (species1.ab > 1 && species2.ab > 1) {
        # exact = FALSE avoids exact-p warnings caused by ties
        # (stackoverflow.com/questions/10711395); Spearman remains valid
        # for non-normal data.  The original also passed
        # na.action = na.rm, which cor.test() silently ignores.
        test <- cor.test(temp[, b], temp[, c],
                         method = "spearman", exact = FALSE)
        rho <- test$estimate
        p.value <- test$p.value
      }
      if (species1.ab <= 1 || species2.ab <= 1) {
        rho <- 0
        p.value <- 1
      }
      new.row <- c(trts[a], names(temp)[b], names(temp)[c],
                   rho, p.value, species1.ab, species2.ab)
      results <- rbind(results, new.row)
    }
  }
  print(a / length(trts)) # progress indicator
}
## [1] 0.1111111
## [1] 0.2222222
## [1] 0.3333333
## [1] 0.4444444
## [1] 0.5555556
## [1] 0.6666667
## [1] 0.7777778
## [1] 0.8888889
## [1] 1
# Convert the accumulated character matrix into a typed data frame.
# BUG FIX: the original used data.frame(data.matrix(results)); data.matrix()
# is documented for data frames and coerces character/factor columns to
# numeric codes, which would destroy the taxa name columns.
results <- as.data.frame(results, stringsAsFactors = FALSE)
names(results) <- c("trt", "taxa1", "taxa2", "rho", "p.value", "ab1", "ab2")
# making sure each column has its proper type
results$trt <- as.factor(results$trt)
results$taxa1 <- as.character(results$taxa1)
results$taxa2 <- as.character(results$taxa2)
results$rho <- as.numeric(results$rho)
results$p.value <- as.numeric(results$p.value)
results$ab1 <- as.numeric(results$ab1)
results$ab2 <- as.numeric(results$ab2)
str(results)
## 'data.frame': 2484 obs. of 7 variables:
## $ trt : Factor w/ 9 levels "IDN_1","IDN_2",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ taxa1 : chr "dhx" "dhx" "dhx" "dhx" ...
## $ taxa2 : chr "whx" "ssx" "defa" "bphx" ...
## $ rho : num 0.0652 -0.1227 0.1899 -0.117 -0.5074 ...
## $ p.value: num 0.689188 0.45061 0.240542 0.472163 0.000832 ...
## $ ab1 : num 1307 1307 1307 1307 1307 ...
## $ ab2 : num 2089 109 1941 1286 84 ...
head(results)
## trt taxa1 taxa2 rho p.value ab1 ab2
## 1 PHL_1 dhx whx 0.06524012 0.6891879742 1307 2089
## 2 PHL_1 dhx ssx -0.12272044 0.4506097948 1307 109
## 3 PHL_1 dhx defa 0.18989475 0.2405423584 1307 1941
## 4 PHL_1 dhx bphx -0.11699678 0.4721633481 1307 1286
## 5 PHL_1 dhx wbpx -0.50743562 0.0008316712 1307 84
## 6 PHL_1 dhx awx 0.00000000 1.0000000000 1307 1
# Keep only the strong associations (|rho| > 0.25).  rho is already numeric,
# so the original's as.numeric(as.character(abs(rho))) round-trip was a
# no-op and has been dropped.
results_sub <- subset(results, abs(rho) > 0.25)
# One edge list per country x cluster group, in sorted group order;
# the list is preallocated instead of grown.
name.groups <- sort(as.vector(unique(groups)))
results_sub.by.group <- vector("list", length(name.groups))
for (i in seq_along(name.groups)) {
  results_sub.by.group[[i]] <- subset(results_sub, trt == name.groups[i])
}
# head(results_sub.by.group[[1]][,2:3]) # get the list
# Build and plot one undirected network per country x cluster group:
# vertices = injury variables, edges = strong Spearman associations.
g <- list()
for(i in 1:length(name.groups)){
g[[i]] <- graph.edgelist(as.matrix(results_sub.by.group[[i]][,2:3]), directed = FALSE)
#== adjust layout (force-directed placement)
l <- layout.fruchterman.reingold(g[[i]])
#== adjust vertices
V(g[[i]])$color <- "tomato"
V(g[[i]])$frame.color <- "gray40"
V(g[[i]])$shape <- "circle"
V(g[[i]])$size <- 25
V(g[[i]])$label.color <- "white"
V(g[[i]])$label.font <- 2
V(g[[i]])$label.family <- "Helvetica"
V(g[[i]])$label.cex <- 0.7
#== adjust the edges
# NOTE(review): weight is the signed rho, so width = 1 + rho*5 can be
# negative for strong negative correlations -- confirm this renders as
# intended (the metrics loop below reassigns weight to |rho|).
E(g[[i]])$weight <- as.matrix(results_sub.by.group[[i]][, 4])
E(g[[i]])$width <- 1 + E(g[[i]])$weight*5
# edge colour by sign of rho: firebrick (negative), forestgreen (positive)
col <- c("firebrick3", "forestgreen")
colc <- cut(results_sub.by.group[[i]]$rho, breaks = c(-1, 0, 1), include.lowest = TRUE)
E(g[[i]])$color <- col[colc]
#== plot network model
plot(g[[i]], layout = l * 1.0, main = paste( "network model of", name.groups[i]))
}
# Centrality metrics per network.  Edge weights are reassigned to |rho|
# so the weighted measures are not distorted by negative correlations.
matrics <- list()
for(i in 1:length(g)){
E(g[[i]])$weight <- as.matrix(abs(results_sub.by.group[[i]][, 4]))
matrics[[i]] <- data.frame(
deg = degree(g[[i]]), # degree centrality
bet = betweenness(g[[i]]), # betweenness centrality
clo = closeness(g[[i]]), # closeness centrality
eig = evcent(g[[i]])$vector,# eigenvector centrality (evcent is the legacy igraph name)
cor = graph.coreness(g[[i]]) # coreness (k-core number)
)
# Residual of eigenvector centrality regressed on betweenness; this
# drives the key-actor plots below.
matrics[[i]]$res <- as.vector(lm(eig ~ bet, data = matrics[[i]])$residuals)
}
One method is to plot / regress eigenvector centrality on betweenness and examine the residuals. - An actor with high betweenness and low eigenvector centrality may be an important gatekeeper to a central actor. - An actor with low betweenness and high eigenvector centrality may have unique access to a central actor.
# Key-actor plots: betweenness vs eigenvector centrality, with label colour
# and size encoding the residual from the eig ~ bet regression.
# FIX: corrected user-facing typos "Eigencvector" -> "Eigenvector" and
# "Injuiry" -> "Injury" in the axis label and title.
for (i in 1:length(matrics)) {
  plot <- ggplot(
    matrics[[i]], aes(x = bet, y = eig,
                      label = rownames(matrics[[i]]),
                      color = res,
                      size = abs(res))) +
    xlab("Betweenness Centrality") +
    ylab("Eigenvector Centrality") +
    geom_text() +
    ggtitle(paste("Key Actor Analysis for Injury Profiles of", name.groups[i]))
  print(plot)
}